{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Script for converting a tree of life from the Tree of Life Project in xml to a networkx graph and to a graphml file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This script convert a Tree of life from an xml format to a networkx graph then to several file formats (json, graphml).\n",
    "The original data can be found here http://tolweb.org/tree/home.pages/downloadtree.html\n",
    "The xml file available on the above website is licenced under the Attribution Creative Commons 3.0 https://creativecommons.org/licenses/by/3.0/, the copyright is owned by the Tree Of Life Project.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2017 Benjamin Ricaud\n",
    "\n",
    "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "you may not use this file except in compliance with the License.\n",
    "You may obtain a copy of the License at\n",
    "\n",
    "    http://www.apache.org/licenses/LICENSE-2.0\n",
    "\n",
    "Unless required by applicable law or agreed to in writing, software\n",
    "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "See the License for the specific language governing permissions and\n",
    "limitations under the License."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Tools for parsing xml\n",
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Load the xml file\n",
    "# If there are error during the loading, make sure the file is encoded in UTF8\n",
    "# You may have to open it with a text editor and save it with encoding UTF8\n",
    "xml_file_to_load = 'tolskeletaldumpUTF8.xml'\n",
    "tree = ET.parse(xml_file_to_load)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# The data will be loaded in a networkx graph\n",
    "# The networkx module can be installed using 'pip install networkx'\n",
    "import networkx as nx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of nodes processed: 35960\n",
      "Number of nodes in the graph: 35960\n",
      "Number of edges in the graph: 35959\n",
      "The graph is a tree? True\n"
     ]
    }
   ],
   "source": [
    "# Code for the tree construction\n",
    "i=1\n",
    "G = nx.DiGraph()\n",
    "root = tree.getroot()\n",
    "for livingElement in root.iter('NODE'):\n",
    "    name = livingElement.find('NAME').text\n",
    "    data_dic = livingElement.attrib\n",
    "    node_id = data_dic['ID']\n",
    "    if name == None:\n",
    "        name = 'None'\n",
    "    data_dic['name'] = name\n",
    "    if not G.has_node(node_id):\n",
    "        G.add_node(node_id,data_dic)\n",
    "    if data_dic['CHILDCOUNT']!='0':\n",
    "        for child in livingElement[1]:\n",
    "            child_name = child.find('NAME').text\n",
    "            child_data_dic = child.attrib\n",
    "            child_id = child_data_dic['ID']\n",
    "            if child_name == None:\n",
    "                child_name = 'None'\n",
    "            child_data_dic['name'] = child_name\n",
    "            #print(child_name,child_data_dic)\n",
    "            if not G.has_node(child_id):\n",
    "                G.add_node(child_id,child_data_dic)\n",
    "            if G.has_edge(node_id,child_id):\n",
    "                print('found exisiting edge',name,child_name)\n",
    "                print('data: ',data_dic,child_data_dic)\n",
    "            G.add_edge(node_id,child_id,weight=1)\n",
    "            i+=1\n",
    "print('Number of nodes processed:',i)\n",
    "print('Number of nodes in the graph:',G.number_of_nodes())\n",
    "print('Number of edges in the graph:',G.number_of_edges())\n",
    "print('The graph is a tree?',nx.is_tree(G))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Root node id: 1\n"
     ]
    }
   ],
   "source": [
    "# Find the root node, the only one that has in_degree 0\n",
    "root_node_list = [n for n,d in G.in_degree().items() if d==0] \n",
    "root_node_id = root_node_list[0]\n",
    "print('Root node id:',root_node_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'HASPAGE': '2', 'CHILDCOUNT': '4', 'CONFIDENCE': '0', 'EXTINCT': '0', 'LEAF': '0', 'name': 'Life on Earth', 'ID': '1', 'PHYLESIS': '0'}\n",
      "Degree: 4\n",
      "Successors:  ['Eubacteria', 'Eukaryotes', 'Viruses', 'Archaea']\n"
     ]
    }
   ],
   "source": [
    "# Details about the root node\n",
    "print(G.node[root_node_id])\n",
    "print('Degree:',G.degree(root_node_id))\n",
    "print('Successors: ',[G.node[node]['name'] for node in G.successors(root_node_id)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Saving the graph in json format\n",
    "from networkx.readwrite import json_graph\n",
    "import json\n",
    "with open('treeoflife.json', 'w') as outfile1:\n",
    "    outfile1.write(json.dumps(json_graph.node_link_data(G)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Saving the graph in graphML format\n",
    "nx.write_graphml(G, \"treeoflife.graphml\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See https://networkx.github.io/ for more file formats and additional details on the handling of the graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
